Normalize the Seattle Data Set


In [7]:
import json
import pandas
import datetime
import numpy as np

In [3]:
# Load the raw Socrata export: column names live under meta/view/columns,
# the row data under 'data'.
with open('mags-97de.json') as fd:
    d = json.load(fd)

column_names = [column['name'] for column in d['meta']['view']['columns']]

df = pandas.DataFrame(d['data'], columns=column_names)

In [4]:
# Inspect the column names pulled from the JSON metadata.
df.columns


Out[4]:
Index(['sid', 'id', 'position', 'created_at', 'created_meta', 'updated_at',
       'updated_meta', 'meta', 'Application/Permit Number', 'Permit Type',
       'Address', 'Description', 'Category', 'Action Type', 'Work Type',
       'Value', 'Applicant Name', 'Application Date', 'Issue Date',
       'Final Date', 'Expiration Date', 'Status', 'Contractor',
       'Permit and Complaint Status URL', 'Master Use Permit', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')

In [5]:
# Row count of the full data set, before any filtering.
len(df)


Out[5]:
48480

In [10]:
# Keep only permits that were actually issued (non-missing 'Issue Date').
# The original compared the stringified column against 'None', which only
# catches Python None (NaN stringifies to 'nan' and would slip through);
# notna() handles both missing representations directly.
df = df[df['Issue Date'].notna()]
len(df)


Out[10]:
32840

In [11]:
def strptime(s):
    """Parse the leading YYYY-MM-DD portion of a date string and return
    its POSIX timestamp (float seconds; uses the local timezone, as
    naive datetimes do)."""
    date_part = s[:10]
    parsed = datetime.datetime.strptime(date_part, '%Y-%m-%d')
    return parsed.timestamp()

# Replace each issue-date string with its numeric timestamp, element-wise.
df['Issue Date'] = df['Issue Date'].map(strptime)

In [13]:
# The prediction target: brand-new construction permits.
is_construction = df['Permit Type'] == 'Construction'
is_new_action = df['Action Type'] == 'NEW'
df['target'] = is_construction & is_new_action

In [14]:
# These are at least theoretically helpful columns
# These are at least theoretically helpful columns
useful_columns = ['Issue Date', 'target', 'Value', 'Category', 'Work Type']

df = df.loc[:, useful_columns]

In [15]:
# Persist the normalized frame for downstream use (note: the row index
# is written as the first CSV column by default).
df.to_csv('seattle.csv')

In [ ]: